In [63]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
In [64]:
data=pd.read_csv("C:/Users/Rakesh/Datasets/diamonds.csv")
In [65]:
data.head()
Out[65]:
Unnamed: 0 carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [66]:
data.shape
Out[66]:
(53940, 11)
In [67]:
data=data.drop("Unnamed: 0",axis=1)
In [68]:
data.head()
Out[68]:
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [69]:
data.isnull().sum()
Out[69]:
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

Analyzing the relationship between carrat and price of the diamond¶

In [70]:
figure=px.scatter(data_frame=data,x='carat', y='price',size='depth', color='cut', trendline='ols')
figure.show()

We see a linear relationship between carat and price. Higher the carat, price automatically increases¶

In [71]:
data['size']=data['x']*data['y']*data['z']
In [72]:
data.head()
Out[72]:
carat cut color clarity depth table price x y z size
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 38.202030
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 34.505856
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 38.076885
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 46.724580
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 51.917250

Relationship between size and price¶

In [73]:
figure= px.scatter(data_frame=data, x='size', y='price', size='size', color='cut', trendline='ols')
In [74]:
figure.show()

There is a linear relationship between price and size

In [75]:
fig=px.box(data, x='cut',y='price', color='color')
In [76]:
fig.show()

Diamond based on their clarity¶

In [77]:
fig=px.box(data_frame=data, x='cut',y='price', color='clarity')
In [78]:
fig.show()
In [79]:
correlation=data.corr()
In [80]:
print(correlation['price'].sort_values(ascending=False))
price    1.000000
carat    0.921591
size     0.902385
x        0.884435
y        0.865421
z        0.861249
table    0.127134
depth   -0.010647
Name: price, dtype: float64
In [81]:
correlation
Out[81]:
carat depth table price x y z size
carat 1.000000 0.028224 0.181618 0.921591 0.975094 0.951722 0.953387 0.976308
depth 0.028224 1.000000 -0.295779 -0.010647 -0.025289 -0.029341 0.094924 0.009157
table 0.181618 -0.295779 1.000000 0.127134 0.195344 0.183760 0.150929 0.167400
price 0.921591 -0.010647 0.127134 1.000000 0.884435 0.865421 0.861249 0.902385
x 0.975094 -0.025289 0.195344 0.884435 1.000000 0.974701 0.970772 0.956564
y 0.951722 -0.029341 0.183760 0.865421 0.974701 1.000000 0.952006 0.975143
z 0.953387 0.094924 0.150929 0.861249 0.970772 0.952006 1.000000 0.950065
size 0.976308 0.009157 0.167400 0.902385 0.956564 0.975143 0.950065 1.000000
In [82]:
print(correlation['carat'].sort_values(ascending=False))
carat    1.000000
size     0.976308
x        0.975094
z        0.953387
y        0.951722
price    0.921591
table    0.181618
depth    0.028224
Name: carat, dtype: float64

Diamond Price Prediction¶

In [83]:
data['cut']= data['cut'].map({"Ideal":1, "Premium":2,"Good":3,"Very Good":4,"Fair":5})

Splitting the data in train and test

In [84]:
import sklearn
from sklearn.model_selection import train_test_split
In [85]:
data.head()
Out[85]:
carat cut color clarity depth table price x y z size
0 0.23 1 E SI2 61.5 55.0 326 3.95 3.98 2.43 38.202030
1 0.21 2 E SI1 59.8 61.0 326 3.89 3.84 2.31 34.505856
2 0.23 3 E VS1 56.9 65.0 327 4.05 4.07 2.31 38.076885
3 0.29 2 I VS2 62.4 58.0 334 4.20 4.23 2.63 46.724580
4 0.31 3 J SI2 63.3 58.0 335 4.34 4.35 2.75 51.917250
In [86]:
x=np.array(data[["carat","cut","size"]])
y=np.array(data[["price"]])
In [87]:
xtrain,xtest,ytrain,ytest= train_test_split(x,y, test_size=0.10, random_state=42)
In [88]:
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()
model.fit(xtrain,ytrain)
C:\Users\Rakesh\AppData\Local\Temp\ipykernel_928\1316745596.py:3: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

Out[88]:
RandomForestRegressor()
In [91]:
print('Enter diamond details to predict price')
a=float(input("Carat Size: "))
b=int(input("Cut Type (Ideal:1,Premium:2,Good:3,Very Good:4, Fair:5)"))
c=float(input("Size: "))
features=np.array([[a,b,c]])
print("Predicted Diamond's price= ", model.predict(features))
Enter diamond details to predict price
Carat Size: 0.60
Cut Type (Ideal:1,Premium:2,Good:3,Very Good:4, Fair:5)2
Size: 40
Predicted Diamond's price=  [934.43833333]

So with the help of diamond details like carat, cut and size we can able to predict the price of the diamond

In [ ]: